[PAE] Allow pgdirs above 4GB for paravirt guests.
author     kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
           Fri, 26 May 2006 16:22:30 +0000 (17:22 +0100)
committer  kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
           Fri, 26 May 2006 16:22:30 +0000 (17:22 +0100)
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unprivileged guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
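
The hypervisor now copes with high pgdirs itself: each vcpu keeps two
low-memory (<4GB) shadow copies of the guest's PAE L3 table. On a
page-table switch, write_ptbase() copies a >=4GB pgdir into whichever
shadow slot is idle before loading CR3, and guest updates to a live
pgdir are propagated into the shadows by IPI (pae_flush_pgd). The
XENFEAT_pae_pgdir_above_4gb flag is now advertised to all guests, not
just shadow-translated ones.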
Signed-off-by: Keir Fraser <keir@xensource.com>
tools/libxc/xc_linux_build.c
tools/libxc/xc_linux_restore.c
tools/libxc/xc_private.c
tools/libxc/xenctrl.h
xen/arch/x86/domain_build.c
xen/arch/x86/mm.c
xen/common/kernel.c
xen/include/asm-x86/domain.h

diff --git a/tools/libxc/xc_linux_build.c b/tools/libxc/xc_linux_build.c
index 1dcf27daf0bc8db1edcd2a29dcfda149e0c8b2cd..260b03d33e00547e23c68b10f7741c76a030a432 100644
--- a/tools/libxc/xc_linux_build.c
+++ b/tools/libxc/xc_linux_build.c
@@ -268,22 +268,11 @@ static int setup_pg_tables_pae(int xc_handle, uint32_t dom,
     l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
     l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
     uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
-    unsigned long ppt_alloc, count, nmfn;
+    unsigned long ppt_alloc, count;
 
     /* First allocate page for page dir. */
     ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
 
-    if ( page_array[ppt_alloc] > 0xfffff )
-    {
-        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
-        if ( nmfn == 0 )
-        {
-            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
-            goto error_out;
-        }
-        page_array[ppt_alloc] = nmfn;
-    }
-
     alloc_pt(l3tab, vl3tab, pl3tab);
     vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
     if (shadow_mode_enabled)
diff --git a/tools/libxc/xc_linux_restore.c b/tools/libxc/xc_linux_restore.c
index 9bf283652c6fe6267cb7e28092e87fda75edbe65..353e0c7d6309599cdebe2882bceeffa56b7dc016 100644
--- a/tools/libxc/xc_linux_restore.c
+++ b/tools/libxc/xc_linux_restore.c
@@ -331,25 +331,17 @@ int xc_linux_restore(int xc_handle, int io_fd,
                 ** A page table page - need to 'uncanonicalize' it, i.e.
                 ** replace all the references to pfns with the corresponding
                 ** mfns for the new domain.
-                **
-                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
-                ** so we may need to update the p2m after the main loop.
-                ** Hence we defer canonicalization of L1s until then.
                 */
-                if(pt_levels != 3 || pagetype != L1TAB) {
-
-                    if(!uncanonicalize_pagetable(pagetype, page)) {
-                        /*
-                        ** Failing to uncanonicalize a page table can be ok
-                        ** under live migration since the pages type may have
-                        ** changed by now (and we'll get an update later).
-                        */
-                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                                pagetype >> 28, pfn, mfn);
-                        nraces++;
-                        continue;
-                    }
-
+                if(!uncanonicalize_pagetable(pagetype, page)) {
+                    /*
+                    ** Failing to uncanonicalize a page table can be ok
+                    ** under live migration since the page's type may have
+                    ** changed by now (and we'll get an update later).
+                    */
+                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                            pagetype >> 28, pfn, mfn);
+                    nraces++;
+                    continue;
                 }
 
             } else if(pagetype != NOTAB) {
@@ -398,100 +390,6 @@ int xc_linux_restore(int xc_handle, int io_fd,
 
     DPRINTF("Received all pages (%d races)\n", nraces);
 
-    if(pt_levels == 3) {
-
-        /*
-        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
-        ** is a little awkward and involves (a) finding all such PGDs and
-        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
-        ** with the new info; and (c) canonicalizing all the L1s using the
-        ** (potentially updated) p2m[].
-        **
-        ** This is relatively slow (and currently involves two passes through
-        ** the pfn_type[] array), but at least seems to be correct. May wish
-        ** to consider more complex approaches to optimize this later.
-        */
-
-        int j, k;
-
-        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
-        for (i = 0; i < max_pfn; i++) {
-
-            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
-
-                unsigned long new_mfn;
-                uint64_t l3ptes[4];
-                uint64_t *l3tab;
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3ptes[j] = l3tab[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
-                    ERR("Couldn't get a page below 4GB :-(");
-                    goto out;
-                }
-
-                p2m[i] = new_mfn;
-                if (xc_add_mmu_update(xc_handle, mmu,
-                                      (((unsigned long long)new_mfn)
-                                       << PAGE_SHIFT) |
-                                      MMU_MACHPHYS_UPDATE, i)) {
-                    ERR("Couldn't m2p on PAE root pgdir");
-                    goto out;
-                }
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ | PROT_WRITE, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3tab[j] = l3ptes[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-            }
-        }
-
-        /* Second pass: find all L1TABs and uncanonicalize them */
-        j = 0;
-
-        for(i = 0; i < max_pfn; i++) {
-
-            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
-                region_mfn[j] = p2m[i];
-                j++;
-            }
-
-            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
-
-                if (!(region_base = xc_map_foreign_batch(
-                          xc_handle, dom, PROT_READ | PROT_WRITE,
-                          region_mfn, j))) {
-                    ERR("map batch failed");
-                    goto out;
-                }
-
-                for(k = 0; k < j; k++) {
-                    if(!uncanonicalize_pagetable(L1TAB,
-                                                 region_base + k*PAGE_SIZE)) {
-                        ERR("failed uncanonicalize pt!");
-                        goto out;
-                    }
-                }
-
-                munmap(region_base, j*PAGE_SIZE);
-                j = 0;
-            }
-        }
-
-    }
-
 
     if (xc_finish_mmu_updates(xc_handle, mmu)) {
         ERR("Error doing finish_mmu_updates()");
diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c
index f776d2ff0a90ce034af3382a0ef4aeb9666e83b6..0c7df9f83f7017a87304f83a37ca03abf0f55318 100644
--- a/tools/libxc/xc_private.c
+++ b/tools/libxc/xc_private.c
@@ -430,28 +430,6 @@ int xc_version(int xc_handle, int cmd, void *arg)
     return rc;
 }
 
-unsigned long xc_make_page_below_4G(
-    int xc_handle, uint32_t domid, unsigned long mfn)
-{
-    unsigned long new_mfn;
-
-    if ( xc_domain_memory_decrease_reservation(
-        xc_handle, domid, 1, 0, &mfn) != 0 )
-    {
-        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
-        return 0;
-    }
-
-    if ( xc_domain_memory_increase_reservation(
-        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
-    {
-        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
-        return 0;
-    }
-
-    return new_mfn;
-}
-
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index aff3ee69f3c25b3233ae8501f9e1b301d2ec76c9..056f9902bc1b00bd00ecc7ad9c5c946260de537f 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -453,9 +453,6 @@ int xc_domain_iomem_permission(int xc_handle,
                                unsigned long nr_mfns,
                                uint8_t allow_access);
 
-unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
-                                    unsigned long mfn);
-
 typedef dom0_perfc_desc_t xc_perfc_desc_t;
 /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
 int xc_perfc_control(int xc_handle,
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index ed839787990fcf142fc7bade8fed1d4d1e24adcf..309ddef558363dc94ba6cd8f1bb470d1e99ed03b 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -367,7 +367,10 @@ int construct_dom0(struct domain *d,
     if ( (1UL << order) > nr_pages )
         panic("Domain 0 allocation is too small for kernel image.\n");
 
-    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
+    /*
+     * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
+     * mapping covers the allocation.
+     */
     if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
         panic("Not enough RAM for domain 0 allocation.\n");
     alloc_spfn = page_to_mfn(page);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 0b9cb8e1b3a31d0fa4f66168c6e47461b1acba77..861e7080d1e86b05b9acda99f41da35abfff440b 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -260,9 +260,42 @@ void share_xen_page_with_privileged_guests(
     share_xen_page_with_guest(page, dom_xen, readonly);
 }
 
+static void __write_ptbase(unsigned long mfn)
+{
+#ifdef CONFIG_X86_PAE
+    if ( mfn >= 0x100000 )
+    {
+        l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
+        struct vcpu *v = current;
+        unsigned long flags;
+
+        /* Protects against re-entry and against __pae_flush_pgd(). */
+        local_irq_save(flags);
+
+        /* Pick an unused low-memory L3 cache slot. */
+        v->arch.lowmem_l3tab_inuse ^= 1;
+        lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse];
+        v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn;
+
+        /* Map the guest L3 table and copy to the chosen low-memory cache. */
+        highmem_l3tab = map_domain_page(mfn);
+        memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab[0]));
+        unmap_domain_page(highmem_l3tab);
+
+        /* Install the low-memory L3 table in CR3. */
+        write_cr3(__pa(lowmem_l3tab));
+
+        local_irq_restore(flags);
+        return;
+    }
+#endif
+
+    write_cr3(mfn << PAGE_SHIFT);
+}
+
 void write_ptbase(struct vcpu *v)
 {
-    write_cr3(pagetable_get_paddr(v->arch.monitor_table));
+    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
 }
 
 void invalidate_shadow_ldt(struct vcpu *v)
@@ -401,6 +434,7 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr,
     return 1;
 }
 
+#ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@ -456,6 +490,7 @@ get_linear_pagetable(
 
     return 1;
 }
+#endif /* !CONFIG_X86_PAE */
 
 int
 get_page_from_l1e(
@@ -564,10 +599,6 @@ get_page_from_l3e(
     rc = get_page_and_type_from_pagenr(
         l3e_get_pfn(l3e),
         PGT_l2_page_table | vaddr, d);
-#if CONFIG_PAGING_LEVELS == 3
-    if ( unlikely(!rc) )
-        rc = get_linear_pagetable(l3e, pfn, d);
-#endif
     return rc;
 }
 #endif /* 3 level */
@@ -773,6 +804,50 @@ static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
     return 1;
 }
 
+struct pae_flush_pgd {
+    unsigned long l3tab_mfn;
+    unsigned int  l3tab_idx;
+    l3_pgentry_t  nl3e;
+};
+
+static void __pae_flush_pgd(void *data)
+{
+    struct pae_flush_pgd *args = data;
+    struct vcpu *v = this_cpu(curr_vcpu);
+    int i = v->arch.lowmem_l3tab_inuse;
+    intpte_t _ol3e, _nl3e, _pl3e;
+    l3_pgentry_t *l3tab_ptr;
+
+    ASSERT(!local_irq_is_enabled());
+
+    if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn )
+        return;
+
+    l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx];
+
+    _ol3e = l3e_get_intpte(*l3tab_ptr);
+    _nl3e = l3e_get_intpte(args->nl3e);
+    _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
+    BUG_ON(_pl3e != _ol3e);
+}
+
+/* Flush a pgdir update into low-memory caches. */
+static void pae_flush_pgd(
+    unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
+{
+    struct domain *d = page_get_owner(mfn_to_page(mfn));
+    struct pae_flush_pgd args = {
+        .l3tab_mfn = mfn,
+        .l3tab_idx = idx,
+        .nl3e      = nl3e };
+
+    /* If below 4GB then the pgdir is not shadowed in low memory. */
+    if ( mfn < 0x100000 )
+        return;
+
+    on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1);
+}
+
 static inline int l1_backptr(
     unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
 {
@@ -787,6 +862,7 @@ static inline int l1_backptr(
 
 #elif CONFIG_X86_64
 # define create_pae_xen_mappings(pl3e) (1)
+# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
 
 static inline int l1_backptr(
     unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
@@ -886,14 +962,6 @@ static int alloc_l3_table(struct page_info *page, unsigned long type)
 
     ASSERT(!shadow_mode_refcounts(d));
 
-#ifdef CONFIG_X86_PAE
-    if ( pfn >= 0x100000 )
-    {
-        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
-        return 0;
-    }
-#endif
-
     pl3e = map_domain_page(pfn);
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
     {
@@ -1241,6 +1309,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
     okay = create_pae_xen_mappings(pl3e);
     BUG_ON(!okay);
 
+    pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
+
     put_page_from_l3e(ol3e, pfn);
     return 1;
 }
@@ -3109,7 +3179,7 @@ void ptwr_flush(struct domain *d, const int which)
 
     if ( unlikely(d->arch.ptwr[which].vcpu != current) )
         /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
-        write_cr3(pagetable_get_paddr(
+        __write_ptbase(pagetable_get_pfn(
             d->arch.ptwr[which].vcpu->arch.guest_table));
     else
         TOGGLE_MODE();
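
(Aside: the following standalone C sketch distils the double-buffered
shadow scheme that __write_ptbase() above implements. The names mirror
the patch, but this fragment is illustrative only, not code from the
tree.)

    #include <string.h>

    typedef unsigned long long l3e_t;    /* one 64-bit PAE L3 entry */

    struct l3_shadow_cache {
        l3e_t         l3tab[2][4];   /* two <4GB shadow L3 tables     */
        unsigned long high_mfn[2];   /* guest MFN each slot shadows   */
        unsigned int  inuse;         /* slot CR3 currently points at  */
    };

    /*
     * Copy a >=4GB guest L3 table into the slot that is NOT currently
     * live, then mark that slot in use. Copying into the live slot
     * would let the CPU walk a half-updated table, which is why the
     * patch keeps two slots rather than one.
     */
    static void shadow_high_pgdir(struct l3_shadow_cache *c,
                                  unsigned long guest_mfn,
                                  const l3e_t guest_l3[4])
    {
        unsigned int next = c->inuse ^ 1;        /* the idle slot */

        memcpy(c->l3tab[next], guest_l3, sizeof(c->l3tab[next]));
        c->high_mfn[next] = guest_mfn;
        c->inuse = next;
        /* A hypervisor would now write_cr3(__pa(c->l3tab[next])). */
    }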
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index e83d50a273308936554f990b4461451694f7c4d9..480138a06590da44ec7ff3e494e994b7f8b39295 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -191,12 +191,11 @@ long do_xen_version(int cmd, XEN_GUEST_HANDLE(void) arg)
         switch ( fi.submap_idx )
         {
         case 0:
-            fi.submap = 0;
+            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
             if ( shadow_mode_translate(current->domain) )
                 fi.submap |= 
                     (1U << XENFEAT_writable_page_tables) |
-                    (1U << XENFEAT_auto_translated_physmap) |
-                    (1U << XENFEAT_pae_pgdir_above_4gb);
+                    (1U << XENFEAT_auto_translated_physmap);
             if ( supervisor_mode_kernel )
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
             break;
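
(Aside: on the guest side, a kernel can test this submap to decide
whether its PAE pgdir may live above 4GB. A minimal sketch, assuming
the standard XENVER_get_features interface and a HYPERVISOR_xen_version
hypercall wrapper as found in the Linux guest headers:)

    #include <xen/interface/version.h>   /* XENVER_get_features */
    #include <xen/interface/features.h>  /* XENFEAT_pae_pgdir_above_4gb */

    /* Nonzero if the hypervisor tolerates PAE pgdirs above 4GB. */
    static int pae_pgdir_above_4gb_ok(void)
    {
        xen_feature_info_t fi = { .submap_idx = 0 };

        if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0 )
            return 0;   /* Old hypervisor: keep the pgdir below 4GB. */

        return !!(fi.submap & (1U << XENFEAT_pae_pgdir_above_4gb));
    }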
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 6581a51b4c498984ef346aca2bdf2f08af8ac582..2628a3a219e59995bc61e9c32c3292fc896fe32d 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -120,6 +120,18 @@ struct arch_vcpu
     struct vcpu_guest_context guest_context
     __attribute__((__aligned__(16)));
 
+#ifdef CONFIG_X86_PAE
+    /*
+     * Two low-memory (<4GB) PAE L3 tables, used as fallback when the guest
+     * supplies a >=4GB PAE L3 table. We need two because we cannot set up
+     * an L3 table while we are currently running on it (without using
+     * expensive atomic 64-bit operations).
+     */
+    l3_pgentry_t  lowmem_l3tab[2][4] __attribute__((__aligned__(32)));
+    unsigned long lowmem_l3tab_high_mfn[2]; /* The >=4GB MFN being shadowed. */
+    unsigned int  lowmem_l3tab_inuse;       /* Which lowmem_l3tab is in use? */
+#endif
+
     unsigned long      flags; /* TF_ */
 
     void (*schedule_tail) (struct vcpu *);
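
(The __aligned__(32) attribute on lowmem_l3tab matters: in PAE mode
CR3 holds a 32-byte-aligned pointer to the 4-entry PDPT, so each
shadow slot must itself be 32-byte aligned to be installable in CR3.)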